Load libraries
Lecture du fichier de données
Recherche d’influence entre quelques variables explicatives avec la variable y (souscription à un service bancaire)
Transformation des variables catégorielles en variable numérique
Création des jeux de données d’entrainement et de test
Normalisation des données
Caret - downsample et upsample
Prédiction avec naive bayes
Prédiction avec naive bayes sur les données downsample
Prédiction avec SVM
Les variables prédictives les plus importantes
Load libraries
library("ggplot2")
library("plotly")
##
## Attachement du package : 'plotly'
## L'objet suivant est masqué depuis 'package:ggplot2':
##
## last_plot
## L'objet suivant est masqué depuis 'package:stats':
##
## filter
## L'objet suivant est masqué depuis 'package:graphics':
##
## layout
library("caret")
## Le chargement a nécessité le package : lattice
# Load the bank marketing dataset (semicolon-separated CSV) and preview it
# as a tibble for compact printing.
bank_data <- read.csv("bank.csv", sep = ";")
dplyr::as_tibble(bank_data)
## # A tibble: 4,521 × 17
## age job marital educa…¹ default balance housing loan contact day month
## <int> <chr> <chr> <chr> <chr> <int> <chr> <chr> <chr> <int> <chr>
## 1 30 unem… married primary no 1787 no no cellul… 19 oct
## 2 33 serv… married second… no 4789 yes yes cellul… 11 may
## 3 35 mana… single tertia… no 1350 yes no cellul… 16 apr
## 4 30 mana… married tertia… no 1476 yes yes unknown 3 jun
## 5 59 blue… married second… no 0 yes no unknown 5 may
## 6 35 mana… single tertia… no 747 no no cellul… 23 feb
## 7 36 self… married tertia… no 307 yes no cellul… 14 may
## 8 39 tech… married second… no 147 yes no cellul… 6 may
## 9 41 entr… married tertia… no 221 yes no unknown 14 may
## 10 43 serv… married primary no -88 yes yes cellul… 17 apr
## # … with 4,511 more rows, 6 more variables: duration <int>, campaign <int>,
## # pdays <int>, previous <int>, poutcome <chr>, y <chr>, and abbreviated
## # variable name ¹education
# Boxplot of call duration by subscription outcome (y).
graph1 <- ggplot(bank_data, aes(x = y, y = duration, fill = y)) + geom_boxplot()
# FIX: corrected French grammar in the displayed title ("de le" -> "de la").
graph1 <- graph1 + ggtitle("L'influence de la variable duration (durée de l'appel) sur la variable souscription y")
# graph1 <- graph1 + theme(plot.title = element_text(hjust = 0.5))
graph1
Nous remarquons que plus la variable duration (durée de l’appel) est grande, plus il y a une probabilité que le client souscrive à un service bancaire
# Boxplot of client age by subscription outcome (y), rendered interactively
# via plotly.
graph2 <- ggplot(bank_data, aes(x = y, y = age, fill = y)) + geom_boxplot()
# FIX: corrected French grammar in the displayed title ("de le" -> "de la").
graph2 <- graph2 + ggtitle("L'influence de la variable age sur la variable souscription y")
graph2 <- ggplotly(graph2)
graph2
Il n’y a pas une grande différence entre la distribution d’âge des souscrits (yes) et celle des non-souscrits (no). Apparemment, ce n’est pas l’âge qui définit si le client va souscrire à un service bancaire.
# Stacked bar chart: subscription outcome (y) split by contact channel.
graph3 <- ggplot(bank_data, aes(y, fill = contact)) +
  geom_bar() +
  ggtitle("L'influence de la variable contact sur la variable souscription y")
graph3
Peut-être que la variable contact a un effet sur le fait que le client va souscrire ou non à un service bancaire. On voit qu’il y a plus de chances que le client ne souscrive pas à un service bancaire s’il répond avec un téléphone mobile (cellular). De même avec un téléphone fixe (telephone).
# Stacked bar chart: subscription outcome (y) split by marital status.
graph4 <- ggplot(bank_data, aes(y, fill = marital)) + geom_bar()
# FIX: corrected typo in the displayed title ("varible" -> "variable").
graph4 <- graph4 + ggtitle("L'influence de la variable marital sur la variable souscription y")
graph4
Peut-être que la variable marital a un effet sur le fait que le client va souscrire ou non à un service bancaire. On voit qu’il y a plus de chances que le client ne souscrive pas à un service bancaire s’il est divorcé.
# Stacked bar chart: subscription outcome (y) split by housing-loan status.
graph5 <- ggplot(bank_data, aes(y, fill = housing)) +
  geom_bar() +
  ggtitle("L'influence de la variable housing sur la variable souscription y") +
  xlab("y (souscription)")
graph5
Peut-être que la variable housing a un effet sur le fait que le client va souscrire ou non à un service bancaire. On voit qu’il y a plus de chances que le client ne souscrive pas à un service bancaire s’il n’a pas de logement (housing).
# Stacked bar chart: subscription outcome (y) split by personal-loan status.
graph6 <- ggplot(bank_data, aes(y, fill = loan)) +
  geom_bar() +
  ggtitle("L'influence de la variable loan sur la variable souscription y") +
  xlab("y (souscription)")
graph6
Peut-être que la variable loan a un effet sur le fait que le client va souscrire ou non à un service bancaire. On voit qu’il y a plus de chances que le client ne souscrive pas à un service bancaire s’il a fait un prêt bancaire.
# One-hot encode every categorical column with caret::dummyVars, then rebuild
# a single binary target "Souscription" from the yno/yyes dummy pair and drop
# those two redundant dummy columns.
dummy_variables <- dummyVars(~ ., data = bank_data)
dummy_variables_data <- as.data.frame(predict(dummy_variables, newdata = bank_data))
dummy_variables_data$Souscription <- ifelse(dummy_variables_data$yno == 1, "No", "Yes")
dummy_variables_data$yno <- NULL
dummy_variables_data$yyes <- NULL
# Reproducible 70/30 train/test split by sampling row indices.
set.seed(3033)
n_rows <- nrow(dummy_variables_data)
training_size <- floor(0.7 * n_rows)
indices <- sample(seq_len(n_rows), size = training_size)
data_bank.train <- dummy_variables_data[indices, ]
data_bank.test <- dummy_variables_data[-indices, ]
# Training set dimensions (rows x columns).
dim(data_bank.train)
## [1] 3164 52
# Test set dimensions (rows x columns).
dim(data_bank.test)
## [1] 1357 52
# Fit centering/scaling parameters on the training set ONLY, then apply the
# same transformation to both train and test sets (avoids test-set leakage).
data_preprocess_value <- preProcess(data_bank.train, method = c("center", "scale"))
data_bank.train.scaled <- predict(data_preprocess_value, data_bank.train)
data_bank.test.scaled <- predict(data_preprocess_value, data_bank.test)
# Class balance of the target in the scaled training set.
table(data_bank.train.scaled[, "Souscription"])
##
## No Yes
## 2795 369
set.seed(3033)
# "not in" operator: the logical negation of %in%.
`%ni%` <- Negate(`%in%`)
# Downsample: randomly drop majority-class ("No") rows until both classes
# have equal counts.
predictor_mask <- colnames(data_bank.train.scaled) %ni% "Souscription"
data_bank.train.scaled.downsample <- downSample(
  x = data_bank.train.scaled[, predictor_mask],
  y = as.factor(data_bank.train.scaled$Souscription)
)
# downSample() names the outcome column "Class"; restore "Souscription".
names(data_bank.train.scaled.downsample)[names(data_bank.train.scaled.downsample) == "Class"] <- "Souscription"
table(data_bank.train.scaled.downsample[, "Souscription"])
##
## No Yes
## 369 369
# Upsample: resample minority-class ("Yes") rows with replacement until both
# classes have equal counts.
predictor_mask <- colnames(data_bank.train.scaled) %ni% "Souscription"
data_bank.train.scaled.upsample <- upSample(
  x = data_bank.train.scaled[, predictor_mask],
  y = as.factor(data_bank.train.scaled$Souscription)
)
# upSample() names the outcome column "Class"; restore "Souscription".
names(data_bank.train.scaled.upsample)[names(data_bank.train.scaled.upsample) == "Class"] <- "Souscription"
table(data_bank.train.scaled.upsample[, "Souscription"])
##
## No Yes
## 2795 2795
# Naive Bayes on the imbalanced scaled training set, evaluated with
# 10-fold cross-validation repeated 3 times.
set.seed(3033)
trainControl_data <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
naive_bayes_desequilibree <- train(
  Souscription ~ .,
  data = data_bank.train.scaled,
  method = "naive_bayes",
  preProcess = NULL,
  trControl = trainControl_data
)
print(naive_bayes_desequilibree)
## Naive Bayes
##
## 3164 samples
## 51 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 2848, 2847, 2848, 2848, 2847, 2847, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.8298512 0.2903414705
## TRUE 0.8831645 0.0009392685
##
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = TRUE
## and adjust = 1.
# Predict on the held-out test set (the last column is the target) and
# report the confusion matrix.
target_col <- ncol(data_bank.test.scaled)
prediction_naive_bayes_desequilibree <- predict(
  naive_bayes_desequilibree,
  newdata = data_bank.test.scaled[, -target_col]
)
confusionMatrix(
  prediction_naive_bayes_desequilibree,
  as.factor(data_bank.test.scaled[, target_col])
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1205 152
## Yes 0 0
##
## Accuracy : 0.888
## 95% CI : (0.87, 0.9043)
## No Information Rate : 0.888
## P-Value [Acc > NIR] : 0.5216
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 1.000
## Specificity : 0.000
## Pos Pred Value : 0.888
## Neg Pred Value : NaN
## Prevalence : 0.888
## Detection Rate : 0.888
## Detection Prevalence : 1.000
## Balanced Accuracy : 0.500
##
## 'Positive' Class : No
##
# Naive Bayes on the DOWNSAMPLED (balanced) training set, evaluated with
# 10-fold cross-validation repeated 3 times.
set.seed(3033)
trainControl_data <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
naive_bayes_downsample <- train(
  Souscription ~ .,
  data = data_bank.train.scaled.downsample,
  method = "naive_bayes",
  preProcess = NULL,
  trControl = trainControl_data
)
print(naive_bayes_downsample)
## Naive Bayes
##
## 738 samples
## 51 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 664, 664, 664, 664, 664, 665, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.6882883 0.3765059
## TRUE 0.6715784 0.3429571
##
## Tuning parameter 'laplace' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were laplace = 0, usekernel = FALSE
## and adjust = 1.
# Predict with the downsampled-trained model on the (untouched) test set
# and report the confusion matrix.
target_col <- ncol(data_bank.test.scaled)
prediction_naive_bayes_downsample <- predict(
  naive_bayes_downsample,
  newdata = data_bank.test.scaled[, -target_col]
)
confusionMatrix(
  prediction_naive_bayes_downsample,
  as.factor(data_bank.test.scaled[, target_col])
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 997 46
## Yes 208 106
##
## Accuracy : 0.8128
## 95% CI : (0.791, 0.8332)
## No Information Rate : 0.888
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.358
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.8274
## Specificity : 0.6974
## Pos Pred Value : 0.9559
## Neg Pred Value : 0.3376
## Prevalence : 0.8880
## Detection Rate : 0.7347
## Detection Prevalence : 0.7686
## Balanced Accuracy : 0.7624
##
## 'Positive' Class : No
##
# Linear-kernel SVM on the imbalanced scaled training set.
set.seed(3033)
trainControl_data <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# BUG FIX: trainControl_data was defined but never passed to train(), so caret
# silently fell back to its default bootstrap resampling (visible in the output
# below: "Bootstrapped (25 reps)"). Passing trControl applies the configured
# 10-fold repeated cross-validation, consistent with the Naive Bayes models.
SVM_desequilibree <- train(
  Souscription ~ .,
  data = data_bank.train.scaled,
  method = "svmLinear",
  preProcess = NULL,
  trControl = trainControl_data
)
print(SVM_desequilibree)
## Support Vector Machines with Linear Kernel
##
## 3164 samples
## 51 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 3164, 3164, 3164, 3164, 3164, 3164, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8944894 0.2971418
##
## Tuning parameter 'C' was held constant at a value of 1
# Predict with the SVM on the test set and report the confusion matrix.
target_col <- ncol(data_bank.test.scaled)
prediction_SVM_desequilibree <- predict(
  SVM_desequilibree,
  newdata = data_bank.test.scaled[, -target_col]
)
confusionMatrix(
  prediction_SVM_desequilibree,
  as.factor(data_bank.test.scaled[, target_col])
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 1188 122
## Yes 17 30
##
## Accuracy : 0.8976
## 95% CI : (0.8802, 0.9132)
## No Information Rate : 0.888
## P-Value [Acc > NIR] : 0.1405
##
## Kappa : 0.2625
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.9859
## Specificity : 0.1974
## Pos Pred Value : 0.9069
## Neg Pred Value : 0.6383
## Prevalence : 0.8880
## Detection Rate : 0.8755
## Detection Prevalence : 0.9654
## Balanced Accuracy : 0.5916
##
## 'Positive' Class : No
##
# ROC-based variable importance for the downsampled Naive Bayes model.
# FIX: use FALSE instead of F (T/F are ordinary variables and can be
# reassigned, unlike the reserved words TRUE/FALSE).
varImp(naive_bayes_downsample, scale = FALSE)
## ROC curve variable importance
##
## only 20 most important variables shown (out of 51)
##
## Importance
## duration 0.8297
## contactcellular 0.6098
## contactunknown 0.6043
## poutcomeunknown 0.5935
## previous 0.5914
## pdays 0.5889
## poutcomesuccess 0.5732
## campaign 0.5721
## monthmay 0.5637
## housingno 0.5623
## housingyes 0.5623
## balance 0.5559
## maritalmarried 0.5501
## loanno 0.5393
## loanyes 0.5393
## maritalsingle 0.5298
## monthapr 0.5285
## monthoct 0.5285
## jobmanagement 0.5244
## educationsecondary 0.5230